# Replication code for
# "Beyond the Mean: How Thinking About The Distribution of Public Opinions
#  Reduces Politicians' Perceptual Errors"
# by Nicholas Dias, Jack Lucas, and Lior Sheffer

# Silence warnings
options(warn = -1)

# Export log
sink("log.txt")

# Install groundhog package
install.packages("groundhog")

# Load groundhog package
library("groundhog")

# List needed packages
packages <- c(
  "BayesPostEst", "broom", "marginaleffects", "MASS", "patchwork", "tidyverse",
  "scales"
)

# Load package versions from R version 4.3.2
groundhog.library(packages, "2024-04-23")

# Overwrite select function
select <- dplyr::select

# Set seed
set.seed(8675309)

# Note to user: Set the appropriate working directory below
# setwd("")

#######################################################################
# Figure 2 in Main Text (Descriptive Overview of Perceptual Accuracy) #
#######################################################################

# Load data
## Survey data from 2020–2023
cmb.2020.to.2023 <- readRDS("data/cmb_2020_to_2023.RDS")

## Cross-survey averages
cmb.avg <- readRDS("data/cmb_avg.RDS")

## MRP estimates by method and year
mrp <- readRDS("data/mrp.RDS")

## Bayesian MRP estimates with propagated error
bayes.mrp.prop.error <- readRDS("data/bayes_mrp_prop_error.RDS")

###############
## Figure 2B ##
###############

# Iterate over 1,000 plausible, Bayesian, MRP estimates of local ideology
est <- bayes.mrp.prop.error %>%
  seq_along() %>%
  lapply(
    function(i) {
      # Stash MRP estimates
      mrp.tmp <- bayes.mrp.prop.error[[i]]

      # Join estimates with cross-survey averages
      cmb.tmp <- cmb.avg %>%
        left_join(mrp.tmp, by = "csduid")

      # Calculate error
      cmb.tmp <- cmb.tmp %>%
        mutate(per_error = lr_point_avg_resident - value)

      # Run model
      mod.tmp <- lm(per_error ~ 1, data = cmb.tmp) %>%
        tidy()

      # Store results
      data.frame(
        iteration = i,
        estimate = rnorm(1, mod.tmp$estimate, mod.tmp$std.error)
      )
    }
  )

# Estimate average error from Bayesian propagation along with credible intervals
est.bayes.prop.error <- est %>%
  bind_rows() %>%
  summarise(
    est = median(estimate),
    conf.low = quantile(estimate, probs = c(0.025)),
    conf.high = quantile(estimate, probs = c(0.975))
  ) %>%
  rename(estimate = est)

# Merge other MRP estimates to cross-survey averages
cmb.avg.with.mrps <- cmb.avg %>%
  left_join(mrp, by = "csduid")

# Calculate MLE error
cmb.avg.with.mrps <- cmb.avg.with.mrps %>%
  mutate(per_error = lr_point_avg_resident - mle_pooled)

# Merge other MRP estimates to 2020–2023 survey data
cmb.2020.to.2023.with.mrps <- cmb.2020.to.2023 %>%
  left_join(mrp, by = "csduid")

# Calculate MLE error for each year
cmb.2020.to.2023.with.mrps <- cmb.2020.to.2023.with.mrps %>%
  mutate(
    per_error = case_when(
      survey_year == 2020 ~ lr_point_avg_resident - mle_2020,
      survey_year == 2021 ~ lr_point_avg_resident - mle_2021,
      survey_year == 2022 ~ lr_point_avg_resident - mle_2022,
      survey_year == 2023 ~ lr_point_avg_resident - mle_2023
    )
  )

# Estimate average errors
## Cross-survey averages
est.avg <- lm(per_error ~ 1, data = cmb.avg.with.mrps) %>%
  tidy(conf.int = T)

## Across 2020–2023
est.2020.to.2023 <- lm(per_error ~ 1, data = cmb.2020.to.2023.with.mrps) %>%
  tidy(conf.int = T)

## By year
est.by.year <- cmb.2020.to.2023.with.mrps %>%
  group_by(survey_year) %>%
  do(
    model = lm(per_error ~ 1, data = .) %>%
      tidy(conf.int = T)
  ) %>%
  unnest(model)

# Bind all estimates together
est.all <- bind_rows(
  est.bayes.prop.error,
  est.avg,
  est.2020.to.2023,
  est.by.year,
)

# Label estimates
est.all <- est.all %>%
  mutate(
    type = c("Bayesian", "Average", "All Years", "2020", "2021", "2022", "2023")
  )

# Plot coefficients
fig.2b <- est.all %>%
  ggplot(
    aes(x = type, y = estimate, ymin = conf.low, ymax = conf.high)
  ) +
  geom_hline(yintercept = 0, linetype = "dotted") +
  geom_pointrange() +
  coord_flip() +
  xlab("") +
  ylab("Average: Politician's Estimate - MRP Estimate") +
  scale_y_continuous(limits = c(-1, 1), breaks = c(-1:1)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(fill = NA, linewidth = 0.5),
    panel.grid = element_blank()
  ) +
  ggtitle("B. Average Perceptual Error by Method and Year")

###############
## Figure 2A ##
###############

# Calculate proportion of politicians who are over-estimators
prop.over.zero <- data.frame(
  label = mean(cmb.avg.with.mrps$per_error > 0, na.rm = TRUE) %>%
    round(2)
)

# Plot distribution of point-estimate errors from cross-survey averages
fig.2a <- cmb.avg.with.mrps %>%
  ggplot(aes(x = per_error)) +
  geom_histogram() +
  geom_text(
    data = prop.over.zero,
    aes(x = -4, y = 200, label = percent(1 - label)),
    hjust = 0,
    size = 4
  ) +
  geom_text(
    data = prop.over.zero,
    aes(
      x = 4, y = 200, label = percent(label)
    ),
    hjust = 1,
    size = 4
  ) +
  geom_vline(xintercept = 0) +
  xlab("Politician's Estimate - MRP Estimate") +
  ylab("Count") +
  scale_x_continuous(limits = c(-4, 4), breaks = c(-4:4)) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 225)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(fill = NA, linewidth = 0.5),
    panel.grid = element_blank()
  ) +
  ggtitle("A. Distribution of Perceptual Errors")

###############
## Figure 2C ##
###############

# Plot distribution by politician ideology
fig.2c <- cmb.avg.with.mrps %>%
  ggplot(aes(x = lr_self, y = per_error)) +
  geom_hline(yintercept = 0) +
  geom_jitter(shape = 1, height = 0, width = 0.1) +
  geom_smooth(se = FALSE, color = "maroon") +
  scale_x_continuous(limits = c(0, 10), breaks = c(0:10)) +
  xlab("Politician's Ideological Self-Placement") +
  ylab("Perceptual Error") +
  theme_minimal() +
  theme(
    panel.border = element_rect(fill = NA, linewidth = 0.5),
    panel.grid = element_blank()
  ) +
  ggtitle("C. Distribution of Perceptual Errors, by Politician's Ideological Self-Placement")

######################
## Arrange and Save ##
######################

# Arrange
fig.2a / fig.2b / fig.2c + plot_layout(heights = c(2, 2, 4))

# Save
ggsave("figures/main_fig2.pdf", width = 8.5, height = 9)

#########################################################
# Figure 3 in Main Text (Perceptual Error by Task Type) #
#########################################################

# Load data from perceived-distribution task
## Tokens data
token.value.data <- readRDS("data/token_value_data.RDS")

## Token-order data
token.order.data <- readRDS("data/token_order_data.RDS")

# Calculate error from point estimates and distribution means
token.value.data <- token.value.data %>%
  mutate(
    point_per_error = lr_point_avg_resident - bayes,
    dist_per_error = lr_dist_avg_resident - bayes
  )

###############
## Figure 3A ##
###############

# Estimate point-estimate error
mod.point <- lm(point_per_error ~ question_order, data = token.value.data)

pred.point <- mod.point %>%
  predictions(newdata = datagrid(question_order = "SPD")) %>%
  mutate(type = "Point\nEstimate")

# Estimate distribution error
pred.dist <- lm(dist_per_error ~ 1, data = token.value.data) %>%
  tidy(conf.int = T) %>%
  mutate(type = "Distribution\nMean")

# Merge predictions
preds <- bind_rows(pred.point, pred.dist)

# Plot predictions
fig.3a <- preds %>%
  ggplot(
    aes(x = type, y = estimate, ymin = conf.low, ymax = conf.high)
  ) +
  geom_hline(yintercept = 0, linetype = "dotted") +
  geom_pointrange() +
  coord_flip() +
  xlab("") +
  ylab("Average: Politician's Estimate - MRP Estimate") +
  scale_y_continuous(limits = c(0, 1)) +
  theme_minimal() +
  theme(panel.border = element_rect(fill = NA, linewidth = 0.5)) +
  ggtitle("A. Perceptual Error for Point Estimate vs. Perceived-Distribution Mean")

###############
## Figure 3B ##
###############

# Estimate point-estimate error by politician's ideology
point.mods.by.ideo <- token.value.data %>%
  filter(!is.na(lr_self_tri)) %>%
  group_by(lr_self_tri) %>%
  do(
    model = lm(point_per_error ~ question_order, data = .)
  ) %>%
  pull(model)

# Extract predictions and bind them
point.preds.by.ideo <- bind_rows(
  point.mods.by.ideo[[1]] %>%
    predictions(newdata = datagrid(question_order = "SPD")),
  point.mods.by.ideo[[2]] %>%
    predictions(newdata = datagrid(question_order = "SPD")),
  point.mods.by.ideo[[3]] %>%
    predictions(newdata = datagrid(question_order = "SPD"))
) %>%
  mutate(
    type = "Point\nEstimate", lr_self_tri = levels(token.value.data$lr_self_tri)
  )

# Estimate distribution error by politician's ideology
dist.preds.by.ideo <- token.value.data %>%
  filter(!is.na(lr_self_tri)) %>%
  group_by(lr_self_tri) %>%
  do(
    model = lm(dist_per_error ~ 1, data = .) %>%
      tidy(conf.int = T)
  ) %>%
  unnest(model) %>%
  mutate(type = "Distribution\nMean")

# Bind predictions
preds.by.ideo <- bind_rows(dist.preds.by.ideo, point.preds.by.ideo)

# Clean predicitons
preds.by.ideo <- preds.by.ideo %>%
  mutate(sig = as.numeric(p.value < 0.05)) %>%
  select(lr_self_tri, type, estimate, conf.low, conf.high, sig)

preds.by.ideo$lr_self_tri <- preds.by.ideo$lr_self_tri %>%
  factor(levels = c("Left", "Centre", "Right"))

# Plot predictions by politician's ideology
fig.3b <- preds.by.ideo %>%
  ggplot(
    aes(
      x = type,
      y = estimate,
      ymin = conf.low,
      ymax = conf.high,
      color = factor(sig)
    )
  ) +
  geom_hline(yintercept = 0, linetype = "dotted") +
  geom_pointrange() +
  coord_flip() +
  xlab("") +
  scale_color_manual(values = c("gray", "black")) +
  ylab("Average: Politician's Estimate - MRP Estimate") +
  theme_minimal() +
  theme(
    panel.border = element_rect(fill = NA, linewidth = 0.5),
    strip.text = element_text(hjust = 0), legend.position = "none"
  ) +
  facet_wrap(~lr_self_tri, nrow = 1) +
  ggtitle("B. Perceptual Error for Point Estimate vs. Perceived-Distribution Mean, by Politician's Ideology")

###############
## Figure 3C ##
###############

# Run model
mod <- lm(point_per_error ~ point_before_dist, data = token.value.data)

# Get predictions
pred <- predictions(mod, newdata = datagrid(point_before_dist = c(0, 1)))

# Recode row IDs
pred$rowid <- pred$rowid %>%
  factor(
    labels = c(
      "Point Estimate After\nPerceived Distribution",
      "Point Estimate Before\nPerceived Distribution"
    )
  )

# Plot
fig.3c <- pred %>%
  ggplot(aes(x = rowid, y = estimate, ymin = conf.low, ymax = conf.high)) +
  geom_hline(yintercept = 0, linetype = "dotted") +
  geom_pointrange() +
  scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.25)) +
  coord_flip() +
  xlab("") +
  ylab("Average: Politician's Estimate - MRP Estimate") +
  theme_minimal() +
  ggtitle("C. Perceptual Error for Point Estimate, by Question Order") +
  theme(panel.border = element_rect(fill = NA, linewidth = 0.5))

######################
## Arrange and Save ##
######################

# Arrange
fig.3a / fig.3b / fig.3c + plot_layout(heights = c(1, 1, 1))

# Save
ggsave("figures/main_fig3.pdf", width = 10, height = 8)

##########################################################################
# Figure 4 in Main Text (Perceptual Error and Projection by Token Order) #
##########################################################################

###############
## Figure 4A ##
###############

# Get projection effects by token order
projection.dist <- token.order.data %>%
  filter(token_action == "ADD") %>%
  group_by(token_order) %>%
  do(
    model = lm(token_value ~ lr_self + bayes, data = .) %>%
      tidy(conf.int = T)
  ) %>%
  unnest(model) %>%
  filter(term == "lr_self") %>%
  filter(token_order <= 20) %>%
  mutate(type = "Perceived-Distribution\nTask")

# Get projection effects on point estimate
projection.point <- lm(
  lr_point_avg_resident ~ lr_self + bayes,
  data = token.value.data
) %>%
  tidy(, conf.int = T) %>%
  filter(term == "lr_self") %>%
  mutate(type = "Point-Estimate\nQuestion", token_order = 0)

# Bind projections
projection <- bind_rows(projection.dist, projection.point)

# Plot
fig.4a <- projection %>%
  ggplot(
    aes(
      x = token_order,
      y = estimate,
      ymin = conf.low,
      ymax = conf.high,
      group = type,
      color = type
    )
  ) +
  geom_hline(yintercept = 0, linetype = "dotted") +
  geom_pointrange() +
  geom_smooth(se = F, color = "black") +
  scale_x_continuous(limits = c(0, 20), breaks = c(1, 5, 10, 15, 20)) +
  theme_minimal() +
  scale_color_manual(values = c("gray", "#377eb8")) +
  ylab("Marginal Effect of\nPolitician's Ideology") +
  xlab("Token Placement Order") +
  theme(
    panel.border = element_rect(fill = NA, linewidth = 0.5),
    panel.grid = element_blank(),
    legend.position = "bottom",
    legend.title = element_blank()
  ) +
  ggtitle("A. Projection Effect by Token Placement Order")

###############
## Figure 4B ##
###############

# Calculate token-wise error
error.by.token.order.data <- token.order.data %>%
  filter(token_action == "ADD" & token_order <= 20) %>%
  mutate(error = token_value - bayes)

# Plot
fig.4b <- error.by.token.order.data %>%
  ggplot(aes(x = token_order, y = error)) +
  geom_hline(yintercept = 0, linetype = "dotted") +
  geom_smooth(color = "black", method = "gam") +
  facet_wrap(~lr_self_tri) +
  scale_x_continuous(limits = c(0, 20), breaks = c(1, 5, 10, 15, 20)) +
  ylab("Error") +
  xlab("Token Placement Order") +
  theme_minimal() +
  theme(
    panel.border = element_rect(fill = NA, linewidth = 0.5),
    legend.position = "bottom",
    panel.grid = element_blank(),
    legend.title = element_blank(),
    strip.text = element_text(hjust = 0)
  ) +
  ggtitle("B. Perceptual Error by Token Placement Order")

###############
## Figure 4C ##
###############

# Calculate cumulative error by token order
## Initialize stash
cumulative <- NULL

## Filter to additions
cumul.data <- token.order.data %>%
  filter(token_action == "ADD")

# Iterate over token orders
for (i in 1:20) {
  # Filter data
  tmp <- cumul.data %>%
    filter(token_order <= i) %>%
    group_by(politician_id) %>%
    summarise(token_mean = mean(token_value)) %>%
    mutate(token_total = i)

  # Store results
  cumulative <- bind_rows(cumulative, tmp)
}

# Not sure how this works exactly
cumulative <- token.order.data %>%
  group_by(politician_id) %>%
  slice(1) %>%
  select(politician_id, bayes, lr_self_tri, lr_self) %>%
  left_join(cumulative, ., by = "politician_id") %>%
  mutate(error = token_mean - bayes)

# Plot
fig.4c <- cumulative %>%
  ggplot(aes(x = token_total, y = error)) +
  geom_hline(yintercept = 0, linetype = "dotted") +
  scale_x_continuous(limits = c(0, 20), breaks = c(1, 5, 10, 15, 20)) +
  geom_smooth(color = "black", method = "gam") +
  facet_wrap(~lr_self_tri) +
  ylab("Cumulative Error") +
  xlab("Token Placement Order") +
  theme_minimal() +
  theme(
    panel.border = element_rect(fill = NA, linewidth = 0.5),
    legend.position = "bottom",
    panel.grid = element_blank(),
    legend.title = element_blank(),
    strip.text = element_text(hjust = 0)
  ) +
  ggtitle("C. Cumulative Perceptual Error by Token Placement Order")

######################
## Arrange and Save ##
######################

# Arrange
fig.4a / fig.4b / fig.4c

# Save
ggsave("figures/main_fig4.pdf", width = 8.5, height = 9)

############################################################################
# Figure 1 in Supplementary Material (Multilevel Regression Model Results) #
############################################################################

# Load models
bayes.model <- readRDS("data/bayes_model.RDS")

# Unpack models
model <- bayes.model[[1]]
labels <- bayes.model[[2]]

# Create table
table <- mcmcTab(model) %>%
  left_join(., labels) %>%
  filter(!is.na(Category))

# Plot
table %>%
  ggplot(aes(x = Label, y = Median, ymin = Lower, ymax = Upper)) +
  geom_hline(yintercept = 0, linetype = "dotted") +
  geom_pointrange() +
  coord_flip() +
  xlab("") +
  ylab("") +
  facet_grid(Category ~ ., space = "free_y", scale = "free_y") +
  theme_minimal() +
  theme(
    panel.border = element_rect(fill = NA, linewidth = 0.5),
    panel.grid.minor = element_blank(),
    panel.grid.major.y = element_blank()
  )

# Save
ggsave("figures/sm_fig1.pdf", width = 8.5, height = 6)

################################################################
# Figure 2 in Supplementary Material (Municipal MRP Estimates) #
################################################################

# List ten biggest municipalities by population
biggest.csduids <- c(
  2466023,
  3506008,
  3520005,
  3521005,
  3521010,
  3525005,
  4611040,
  4806016,
  4811061,
  5915022
)

# Get csduids corresponding to min and max estimates
min.csduid <- mrp %>%
  subset(bayes == min(bayes)) %>%
  pull(csduid)

max.csduid <- mrp %>%
  subset(bayes == max(bayes)) %>%
  pull(csduid)

# Combine lists
set.csduids <- c(
  biggest.csduids,
  min.csduid,
  max.csduid
)

# Randomly select 20 other municipalities
random.csduids <- sample(
  setdiff(unique(mrp$csduid), set.csduids),
  size = 20,
  replace = FALSE
)

# Subset data
mrp.filtered <- mrp %>%
  subset(csduid %in% biggest.csduids | csduid %in% random.csduids) %>%
  select(csdname, bayes, bayes_lwr, bayes_upr) %>%
  arrange(csdname)

# Plot
mrp.filtered %>%
  ggplot(
    aes(
      x = reorder(csdname, bayes),
      y = bayes,
      ymin = bayes_lwr,
      ymax = bayes_upr
    )
  ) +
  geom_hline(yintercept = 5) +
  geom_pointrange(size = .4) +
  coord_flip() +
  ylab("Municipal Left-Right Ideology") +
  xlab("") +
  scale_y_continuous(limits = c(2.6, 7.4), breaks = c(3:7)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(fill = NA, linewidth = 0.5),
    panel.grid = element_blank()
  )

# Save
ggsave("figures/sm_fig2.pdf", width = 8.5, height = 6)

###################################################################
# Figure 3 in Supplementary Material (Ideology and Policy Issues) #
###################################################################

# Load data
issue.data <- readRDS("data/issue_data.RDS")

# Prepping data
prop.over.zero <- issue.data %>%
  group_by(policy, sign_both) %>%
  summarise(n = n()) %>%
  mutate(percentage = n / sum(n))

prop.over.zero$xpos <- rep(c(-3.7, 3.7, -3.7, 3.7), times = 6)
prop.over.zero$ypos <- rep(c(-0.9, -0.9, 0.9, 0.9), times = 6)

# Plot
issue.data %>%
  ggplot(aes(x = lr_point_per_error, y = policy_point_per_error)) +
  scale_y_continuous(limits = c(-1, 1)) +
  scale_x_continuous(limits = c(-4, 4)) +
  geom_hline(yintercept = 0, linetype = "dotted") +
  geom_vline(xintercept = 0, linetype = "dotted") +
  geom_point(alpha = 0.4, aes(color = sign_both)) +
  xlab("Perceptual Error: Ideology") +
  ylab("Perceptual Error: Policy Preferences") +
  geom_smooth(method = lm, se = F, color = "black") +
  facet_wrap(~policy) +
  geom_text(
    data = prop.over.zero,
    aes(
      label = percent(percentage, accuracy = 1L),
      color = sign_both,
      x = xpos,
      y = ypos
    ),
    hjust = 0.5,
    show.legend = F
  ) +
  scale_color_manual(
    values = c("#1b9e77", "#d95f02", "#7570b3", "#e7298a")
  ) +
  theme_minimal() +
  theme(
    panel.border = element_rect(fill = NA, linewidth = 0.5),
    panel.grid = element_blank(),
    legend.position = "bottom",
    legend.title = element_blank()
  ) +
  guides(color = "none")

# Save
ggsave("figures/sm_fig3.pdf", width = 8.5, height = 7)

##############################################################
# Figure 7 in Supplementary Material (Additional Validation) #
##############################################################

###############
## Figure 7A ##
###############

fig.sm7a <- token.value.data %>%
  filter(!is.na(lr_self_tri)) %>%
  ggplot(aes(x = point_per_error, y = dist_per_error)) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  geom_vline(xintercept = 0, linetype = "dashed") +
  geom_abline(intercept = 0, slope = 1) +
  geom_point(shape = 1, color = "gray") +
  geom_smooth(method = rlm) +
  facet_wrap(~lr_self_tri) +
  theme_minimal() +
  theme(
    panel.border = element_rect(fill = NA, linewidth = 0.5),
    panel.grid = element_blank(),
    legend.position = "bottom",
    strip.text = element_text(hjust = 0),
    legend.title = element_blank()
  ) +
  labs(
    title = "A. Politician's Perceptual Error, by Question Type",
    x = "Point Estimate - MRP Estimate",
    y = "Perceived-Distribution Mean - MRP Estimate"
  )

###############
## Figure 7B ##
###############

# Get distribution of point-estimate error by question order
point.by.ques.order <- token.value.data %>%
  group_by(point_before_dist, lr_point_avg_resident) %>%
  summarise(n = n()) %>%
  mutate(percent = n / sum(n))

# Recode question-order column
point.by.ques.order$point_before_dist <- point.by.ques.order$point_before_dist %>%
  factor(
    labels = c(
      "Point Estimate After Perceived Distribution",
      "Point Estimate Before Perceived Distribution"
    )
  )

# Plot
fig.sm7b <- point.by.ques.order %>%
  ggplot(
    aes(
      x = factor(lr_point_avg_resident),
      y = percent,
      fill = point_before_dist
    )
  ) +
  geom_bar(stat = "identity", position = position_dodge()) +
  geom_text(
    aes(y = 0.01, label = percent(percent, accuracy = 1L)),
    position = position_dodge(width = .9)
  ) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 0.35), labels = percent) +
  scale_fill_brewer(type = "qual", palette = 2) +
  ylab("") +
  xlab("Point Estimate of Average Municipal Resident's Ideology") +
  theme_minimal() +
  theme(
    panel.border = element_rect(fill = NA, linewidth = 0.5),
    panel.grid.minor = element_blank(),
    legend.title = element_blank(),
    legend.position = "bottom"
  ) +
  ggtitle("B. Politicians' Point Estimates, by Question Order")

######################
## Arrange and Save ##
######################

# Arrange
fig.sm7a / fig.sm7b + plot_layout(heights = c(3, 2))

# Save
ggsave("figures/sm_fig7.pdf", width = 12, height = 9)
